{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "90d43b56-97e5-45e2-8e67-4488ed31d2df",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Run PyTorchJob From Function\n",
    "\n",
    "In this Notebook we are going to create [Kubeflow PyTorchJob](https://www.kubeflow.org/docs/components/training/pytorch/).\n",
    "\n",
    "The PyTorchJob will run distributive training using [DistributedDataParallel strategy](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a8bb6564-fde3-4c28-841c-012122643dd9",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Install Kubeflow Python SDKs\n",
    "\n",
    "You need to install PyTorch packages and Kubeflow SDKs to run this Notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d49f072e-2221-48bb-9f6d-561713d1a45c",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install torch==2.1.2\n",
    "!pip install torchvision==0.19.1\n",
    "\n",
    "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n",
    "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e9331a05-9127-4b3a-8077-31157e267827",
   "metadata": {},
   "source": [
    "## Create Train Script for CNN Model\n",
    "\n",
    "This is simple **Convolutional Neural Network (CNN)** model for recognizing different picture of clothing using [Fashion MNIST Dataset](https://github.com/zalandoresearch/fashion-mnist)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "69f21f33-5c64-452c-90c4-977fc0dadb3b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def train_pytorch_model(parameters):\n",
    "    import logging\n",
    "    import os\n",
    "\n",
    "    import torch\n",
    "    import torch.distributed as dist\n",
    "    import torch.nn.functional as F\n",
    "    from torch import nn\n",
    "    from torch.utils.data import DistributedSampler\n",
    "    from torchvision import datasets, transforms\n",
    "\n",
    "    logging.basicConfig(\n",
    "        format=\"%(asctime)s %(levelname)-8s %(message)s\",\n",
    "        datefmt=\"%Y-%m-%dT%H:%M:%SZ\",\n",
    "        level=logging.INFO,\n",
    "    )\n",
    "\n",
    "    # Create PyTorch CNN Model.\n",
    "    class Net(nn.Module):\n",
    "        def __init__(self):\n",
    "            super(Net, self).__init__()\n",
    "            self.conv1 = nn.Conv2d(1, 20, 5, 1)\n",
    "            self.conv2 = nn.Conv2d(20, 50, 5, 1)\n",
    "            self.fc1 = nn.Linear(4 * 4 * 50, 500)\n",
    "            self.fc2 = nn.Linear(500, 10)\n",
    "\n",
    "        def forward(self, x):\n",
    "            x = F.relu(self.conv1(x))\n",
    "            x = F.max_pool2d(x, 2, 2)\n",
    "            x = F.relu(self.conv2(x))\n",
    "            x = F.max_pool2d(x, 2, 2)\n",
    "            x = x.view(-1, 4 * 4 * 50)\n",
    "            x = F.relu(self.fc1(x))\n",
    "            x = self.fc2(x)\n",
    "            return F.log_softmax(x, dim=1)\n",
    "\n",
    "    # IF GPU is available, nccl dist backend is used. Otherwise, gloo dist backend is used.\n",
    "    if torch.cuda.is_available():\n",
    "        device = \"cuda\"\n",
    "        backend = \"nccl\"\n",
    "    else:\n",
    "        device = \"cpu\"\n",
    "        backend = \"gloo\"\n",
    "    \n",
    "    logging.info(f\"Using Device: {device}, Backend: {backend}\")\n",
    "\n",
    "    # Setup PyTorch DDP. Distributed environment will be set automatically by Training Operator.\n",
    "    dist.init_process_group(backend=backend)\n",
    "    Distributor = torch.nn.parallel.DistributedDataParallel\n",
    "    local_rank = int(os.getenv(\"LOCAL_RANK\", 0))\n",
    "    logging.info(\n",
    "        \"Distributed Training for WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}\".format(\n",
    "            dist.get_world_size(),\n",
    "            dist.get_rank(),\n",
    "            local_rank,\n",
    "        )\n",
    "    )\n",
    "\n",
    "    # Attach model to the correct device.\n",
    "    device = torch.device(f\"{device}:{local_rank}\")\n",
    "    model = Net().to(device)\n",
    "    model = Distributor(model)\n",
    "    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)\n",
    "\n",
    "    # Get Fashion MNIST Dataset.\n",
    "    dataset = datasets.FashionMNIST(\n",
    "        \"./data\",\n",
    "        train=True,\n",
    "        download=True,\n",
    "        transform=transforms.Compose([transforms.ToTensor()]),\n",
    "    )\n",
    "\n",
    "    # Every PyTorchJob worker gets distributed sampler of dataset.\n",
    "    train_loader = torch.utils.data.DataLoader(\n",
    "        dataset,\n",
    "        batch_size=128,\n",
    "        sampler=DistributedSampler(dataset),\n",
    "    )\n",
    "\n",
    "    # Start Training.\n",
    "    logging.info(f\"Start training for RANK: {dist.get_rank()}. WORLD_SIZE: {dist.get_world_size()}\")\n",
    "\n",
    "    for epoch in range(int(parameters[\"NUM_EPOCHS\"])):\n",
    "        model.train()\n",
    "\n",
    "        for batch_idx, (data, target) in enumerate(train_loader):\n",
    "            # Attach tensors to the device.\n",
    "            data = data.to(device)\n",
    "            target = target.to(device)\n",
    "\n",
    "            optimizer.zero_grad()\n",
    "            output = model(data)\n",
    "            loss = F.nll_loss(output, target)\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "            if batch_idx % 10 == 0 and dist.get_rank() == 0:\n",
    "                logging.info(\n",
    "                    \"Train Epoch: {} [{}/{} ({:.0f}%)]\\tloss={:.4f}\".format(\n",
    "                        epoch,\n",
    "                        batch_idx * len(data),\n",
    "                        len(train_loader.dataset),\n",
    "                        100.0 * batch_idx / len(train_loader),\n",
    "                        loss.item(),\n",
    "                    )\n",
    "                )\n",
    "    if dist.get_rank() == 0:\n",
    "        logging.info(\"Training is finished\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8cfe8739-1f94-476a-80e3-dd6e3237d9ed",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-09-01T19:32:37.813779Z",
     "iopub.status.busy": "2022-09-01T19:32:37.812759Z",
     "iopub.status.idle": "2022-09-01T19:32:37.827050Z",
     "shell.execute_reply": "2022-09-01T19:32:37.825186Z",
     "shell.execute_reply.started": "2022-09-01T19:32:37.813690Z"
    }
   },
   "source": [
    "## Run Training Locally in the Notebook\n",
    "\n",
    "We are going to download Fashion MNIST Dataset and start local training."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9e2c6fd8-d0ba-4bc6-ac90-d4cf09751ace",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-10-08T13:58:29Z INFO     Using Device: cpu, Backend: gloo\n",
      "2024-10-08T13:58:29Z INFO     Distributed Training for WORLD_SIZE: 1, RANK: 0, LOCAL_RANK: 0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 26421880/26421880 [00:02<00:00, 9155631.80it/s] \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 29515/29515 [00:00<00:00, 1364085.84it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 4422102/4422102 [00:00<00:00, 8802674.51it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5148/5148 [00:00<00:00, 8424610.61it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "2024-10-08T13:58:33Z INFO     Start training for RANK: 0. WORLD_SIZE: 1\n",
      "2024-10-08T13:58:33Z INFO     Train Epoch: 0 [0/60000 (0%)]\tloss=2.3017\n",
      "2024-10-08T13:58:33Z INFO     Train Epoch: 0 [1280/60000 (2%)]\tloss=2.2850\n",
      "2024-10-08T13:58:34Z INFO     Train Epoch: 0 [2560/60000 (4%)]\tloss=2.2844\n",
      "2024-10-08T13:58:34Z INFO     Train Epoch: 0 [3840/60000 (6%)]\tloss=2.2718\n",
      "2024-10-08T13:58:34Z INFO     Train Epoch: 0 [5120/60000 (9%)]\tloss=2.2489\n",
      "2024-10-08T13:58:35Z INFO     Train Epoch: 0 [6400/60000 (11%)]\tloss=2.2405\n",
      "2024-10-08T13:58:35Z INFO     Train Epoch: 0 [7680/60000 (13%)]\tloss=2.2178\n",
      "2024-10-08T13:58:35Z INFO     Train Epoch: 0 [8960/60000 (15%)]\tloss=2.1755\n",
      "2024-10-08T13:58:35Z INFO     Train Epoch: 0 [10240/60000 (17%)]\tloss=2.1326\n",
      "2024-10-08T13:58:36Z INFO     Train Epoch: 0 [11520/60000 (19%)]\tloss=2.0784\n",
      "2024-10-08T13:58:36Z INFO     Train Epoch: 0 [12800/60000 (21%)]\tloss=1.9585\n",
      "2024-10-08T13:58:36Z INFO     Train Epoch: 0 [14080/60000 (23%)]\tloss=1.8107\n",
      "2024-10-08T13:58:36Z INFO     Train Epoch: 0 [15360/60000 (26%)]\tloss=1.6047\n",
      "2024-10-08T13:58:37Z INFO     Train Epoch: 0 [16640/60000 (28%)]\tloss=1.4722\n",
      "2024-10-08T13:58:37Z INFO     Train Epoch: 0 [17920/60000 (30%)]\tloss=1.3473\n",
      "2024-10-08T13:58:37Z INFO     Train Epoch: 0 [19200/60000 (32%)]\tloss=1.2142\n",
      "2024-10-08T13:58:38Z INFO     Train Epoch: 0 [20480/60000 (34%)]\tloss=1.1584\n",
      "2024-10-08T13:58:38Z INFO     Train Epoch: 0 [21760/60000 (36%)]\tloss=1.0055\n",
      "2024-10-08T13:58:38Z INFO     Train Epoch: 0 [23040/60000 (38%)]\tloss=0.9729\n",
      "2024-10-08T13:58:38Z INFO     Train Epoch: 0 [24320/60000 (41%)]\tloss=1.0776\n",
      "2024-10-08T13:58:39Z INFO     Train Epoch: 0 [25600/60000 (43%)]\tloss=1.1153\n",
      "2024-10-08T13:58:39Z INFO     Train Epoch: 0 [26880/60000 (45%)]\tloss=0.9125\n",
      "2024-10-08T13:58:39Z INFO     Train Epoch: 0 [28160/60000 (47%)]\tloss=1.0451\n",
      "2024-10-08T13:58:39Z INFO     Train Epoch: 0 [29440/60000 (49%)]\tloss=1.0821\n",
      "2024-10-08T13:58:40Z INFO     Train Epoch: 0 [30720/60000 (51%)]\tloss=0.7935\n",
      "2024-10-08T13:58:40Z INFO     Train Epoch: 0 [32000/60000 (53%)]\tloss=1.0418\n",
      "2024-10-08T13:58:40Z INFO     Train Epoch: 0 [33280/60000 (55%)]\tloss=0.8537\n",
      "2024-10-08T13:58:41Z INFO     Train Epoch: 0 [34560/60000 (58%)]\tloss=0.8402\n",
      "2024-10-08T13:58:41Z INFO     Train Epoch: 0 [35840/60000 (60%)]\tloss=0.9968\n",
      "2024-10-08T13:58:41Z INFO     Train Epoch: 0 [37120/60000 (62%)]\tloss=0.9956\n",
      "2024-10-08T13:58:42Z INFO     Train Epoch: 0 [38400/60000 (64%)]\tloss=1.0038\n",
      "2024-10-08T13:58:42Z INFO     Train Epoch: 0 [39680/60000 (66%)]\tloss=0.8188\n",
      "2024-10-08T13:58:42Z INFO     Train Epoch: 0 [40960/60000 (68%)]\tloss=1.0646\n",
      "2024-10-08T13:58:42Z INFO     Train Epoch: 0 [42240/60000 (70%)]\tloss=0.6951\n",
      "2024-10-08T13:58:43Z INFO     Train Epoch: 0 [43520/60000 (72%)]\tloss=0.9384\n",
      "2024-10-08T13:58:43Z INFO     Train Epoch: 0 [44800/60000 (75%)]\tloss=0.9681\n",
      "2024-10-08T13:58:43Z INFO     Train Epoch: 0 [46080/60000 (77%)]\tloss=0.7699\n",
      "2024-10-08T13:58:44Z INFO     Train Epoch: 0 [47360/60000 (79%)]\tloss=0.9631\n",
      "2024-10-08T13:58:44Z INFO     Train Epoch: 0 [48640/60000 (81%)]\tloss=0.9253\n",
      "2024-10-08T13:58:44Z INFO     Train Epoch: 0 [49920/60000 (83%)]\tloss=0.6612\n",
      "2024-10-08T13:58:45Z INFO     Train Epoch: 0 [51200/60000 (85%)]\tloss=0.9142\n",
      "2024-10-08T13:58:45Z INFO     Train Epoch: 0 [52480/60000 (87%)]\tloss=0.7794\n",
      "2024-10-08T13:58:46Z INFO     Train Epoch: 0 [53760/60000 (90%)]\tloss=0.8052\n",
      "2024-10-08T13:58:46Z INFO     Train Epoch: 0 [55040/60000 (92%)]\tloss=0.8630\n",
      "2024-10-08T13:58:46Z INFO     Train Epoch: 0 [56320/60000 (94%)]\tloss=0.7686\n",
      "2024-10-08T13:58:47Z INFO     Train Epoch: 0 [57600/60000 (96%)]\tloss=0.7940\n",
      "2024-10-08T13:58:47Z INFO     Train Epoch: 0 [58880/60000 (98%)]\tloss=0.6830\n",
      "2024-10-08T13:58:47Z INFO     Training is finished\n"
     ]
    }
   ],
   "source": [
    "# Set dist env variables to run the above training locally on the Notebook.\n",
    "import os\n",
    "\n",
    "os.environ[\"RANK\"] = \"0\"\n",
    "os.environ[\"LOCAL_RANK\"] = \"0\"\n",
    "os.environ[\"WORLD_SIZE\"] = \"1\"\n",
    "os.environ[\"MASTER_ADDR\"] = \"localhost\"\n",
    "os.environ[\"MASTER_PORT\"] = \"1234\"\n",
    "\n",
    "# Train Model locally in the Notebook.\n",
    "train_pytorch_model({\"NUM_EPOCHS\": \"1\"})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5aae47e3-be31-468e-8f38-89e1e2f1c764",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Start Distributive Training with PyTorchJob\n",
    "\n",
    "Before creating PyTorchJob, you have to create `TrainingClient()`. It uses [Kubernetes Python client](https://github.com/kubernetes-client/python) to communicate with Kubernetes API server. You can set path and context for [the kubeconfig file](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/). The default location for the kubeconfig is `~/.kube/config`.\n",
    "\n",
    "Kubeflow Training Operator automatically set the appropriate env variables (`MASTER_PORT`, `MASTER_ADDR`, `WORLD_SIZE`, `RANK`) for each PyTorchJob container.\n",
    "\n",
    "PyTorchJob will train model on 3 epochs with 3 PyTorch workers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "eb1acd34-ebcf-409b-8bb3-0225cee37110",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from kubeflow.training import TrainingClient, constants\n",
    "\n",
    "# Start PyTorchJob Training.\n",
    "pytorchjob_name = \"train-pytorch\"\n",
    "\n",
    "# Since we set `job_kind = PyTorchJob` APIs are going to use PyTorchJob as a default Job kind.\n",
    "training_client = TrainingClient(job_kind=constants.PYTORCHJOB_KIND)\n",
    "\n",
    "training_client.create_job(\n",
    "    name=pytorchjob_name,\n",
    "    train_func=train_pytorch_model,\n",
    "    parameters={\"NUM_EPOCHS\": \"3\"}, # Input parameters for the train function.\n",
    "    num_workers=2,  # How many PyTorch Nodes will be created.\n",
    "    num_procs_per_worker=2, # How many procs per node will be used (e.g. number of CPUs/GPUs in a single Node)\n",
    "    resources_per_worker={\"cpu\": \"2\"}\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e44c3ad7-62c4-4b58-b52a-15fd8746b772",
   "metadata": {},
   "source": [
    "### Check the PyTorchJob Status\n",
    "\n",
    "Use `TrainingClient()` APIs to get information about created PyTorchJob."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4141f6c2-c38f-4972-b68a-35d150ef7485",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PyTorchJob Status: False\n"
     ]
    }
   ],
   "source": [
    "print(f\"PyTorchJob Status: {training_client.is_job_running(name=pytorchjob_name)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42e10587-7ac2-45bf-9c4f-d418e1585974",
   "metadata": {},
   "source": [
    "### Get PyTorchJob Pod Names\n",
    "\n",
    "Since we used 3 workers, PyTorchJob will create 1 master pod and 2 worker pods to execute distributed training."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "49b53308-a19b-45e8-942f-4333e727ee48",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['train-pytorch-master-0', 'train-pytorch-worker-0']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_client.get_job_pod_names(pytorchjob_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b91d332d-487c-4a95-937d-26ffb6199cda",
   "metadata": {
    "execution": {
     "iopub.status.busy": "2022-09-01T20:10:25.759950Z",
     "iopub.status.idle": "2022-09-01T20:10:25.760581Z",
     "shell.execute_reply": "2022-09-01T20:10:25.760353Z",
     "shell.execute_reply.started": "2022-09-01T20:10:25.760328Z"
    },
    "tags": []
   },
   "source": [
    "### Get PyTorchJob Training Logs\n",
    "\n",
    "We can get the logs from the master pod.\n",
    "\n",
    "Every worker processes 20000 data samples on each epoch since we distribute 60000 samples across 3 workers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5232d542-d4bf-4c51-8b11-ad0534fb0b9d",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-08 13:25:08,740] torch.distributed.run: [WARNING] master_addr is only used for static rdzv_backend and when rdzv_endpoint is not specified.\n",
      "[2024-10-08 13:25:08,741] torch.distributed.run: [WARNING] \n",
      "[2024-10-08 13:25:08,741] torch.distributed.run: [WARNING] *****************************************\n",
      "[2024-10-08 13:25:08,741] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. \n",
      "[2024-10-08 13:25:08,741] torch.distributed.run: [WARNING] *****************************************\n",
      "[W socket.cpp:663] [c10d] The IPv6 network addresses of (train-pytorch-worker-0, 23456) cannot be retrieved (gai error: -2 - Name or service not known).\n",
      "[W socket.cpp:663] [c10d] The IPv6 network addresses of (train-pytorch-worker-0, 23456) cannot be retrieved (gai error: -2 - Name or service not known).\n",
      "[W socket.cpp:663] [c10d] The IPv6 network addresses of (train-pytorch-worker-0, 23456) cannot be retrieved (gai error: -2 - Name or service not known).\n",
      "[W socket.cpp:663] [c10d] The IPv6 network addresses of (train-pytorch-worker-0, 23456) cannot be retrieved (gai error: -2 - Name or service not known).\n",
      "[W socket.cpp:663] [c10d] The IPv6 network addresses of (train-pytorch-worker-0, 23456) cannot be retrieved (gai error: -2 - Name or service not known).\n",
      "2024-10-08T13:25:15Z INFO     Using Device: cpu, Backend: gloo\n",
      "2024-10-08T13:25:15Z INFO     Using Device: cpu, Backend: gloo\n",
      "2024-10-08T13:25:16Z INFO     Distributed Training for WORLD_SIZE: 4, RANK: 0, LOCAL_RANK: 0\n",
      "2024-10-08T13:25:16Z INFO     Distributed Training for WORLD_SIZE: 4, RANK: 1, LOCAL_RANK: 1\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n",
      "100%|██████████| 26421880/26421880 [00:02<00:00, 12700502.50it/s]\n",
      "100%|██████████| 26421880/26421880 [00:02<00:00, 12593356.31it/s]\n",
      "Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
      "Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n",
      "\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n",
      "100%|██████████| 29515/29515 [00:00<00:00, 212712.93it/s]\n",
      "Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n",
      "100%|██████████| 29515/29515 [00:00<00:00, 212353.88it/s]\n",
      "Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n",
      "100%|██████████| 4422102/4422102 [00:05<00:00, 744014.92it/s] \n",
      "Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n",
      "100%|██████████| 5148/5148 [00:00<00:00, 48197046.86it/s]t/s]\n",
      "Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "2024-10-08T13:25:27Z INFO     Start training for RANK: 0. WORLD_SIZE: 4\n",
      "100%|██████████| 4422102/4422102 [00:07<00:00, 581699.71it/s] \n",
      "Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n",
      "Using downloaded and verified file: ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n",
      "Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "2024-10-08T13:25:28Z INFO     Start training for RANK: 1. WORLD_SIZE: 4\n",
      "2024-10-08T13:25:29Z INFO     Train Epoch: 0 [0/60000 (0%)]\tloss=2.3060\n",
      "2024-10-08T13:25:29Z INFO     Train Epoch: 0 [1280/60000 (8%)]\tloss=2.2977\n",
      "2024-10-08T13:25:30Z INFO     Train Epoch: 0 [2560/60000 (17%)]\tloss=2.2900\n",
      "2024-10-08T13:25:31Z INFO     Train Epoch: 0 [3840/60000 (25%)]\tloss=2.2840\n",
      "2024-10-08T13:25:31Z INFO     Train Epoch: 0 [5120/60000 (34%)]\tloss=2.2755\n",
      "2024-10-08T13:25:32Z INFO     Train Epoch: 0 [6400/60000 (42%)]\tloss=2.2699\n",
      "2024-10-08T13:25:33Z INFO     Train Epoch: 0 [7680/60000 (51%)]\tloss=2.2518\n",
      "2024-10-08T13:25:34Z INFO     Train Epoch: 0 [8960/60000 (59%)]\tloss=2.2496\n",
      "2024-10-08T13:25:34Z INFO     Train Epoch: 0 [10240/60000 (68%)]\tloss=2.2273\n",
      "2024-10-08T13:25:35Z INFO     Train Epoch: 0 [11520/60000 (76%)]\tloss=2.1986\n",
      "2024-10-08T13:25:36Z INFO     Train Epoch: 0 [12800/60000 (85%)]\tloss=2.1755\n",
      "2024-10-08T13:25:37Z INFO     Train Epoch: 0 [14080/60000 (93%)]\tloss=2.1046\n",
      "2024-10-08T13:25:37Z INFO     Train Epoch: 1 [0/60000 (0%)]\tloss=2.0498\n",
      "2024-10-08T13:25:38Z INFO     Train Epoch: 1 [1280/60000 (8%)]\tloss=1.9492\n",
      "2024-10-08T13:25:39Z INFO     Train Epoch: 1 [2560/60000 (17%)]\tloss=1.8004\n",
      "2024-10-08T13:25:39Z INFO     Train Epoch: 1 [3840/60000 (25%)]\tloss=1.6088\n",
      "2024-10-08T13:25:40Z INFO     Train Epoch: 1 [5120/60000 (34%)]\tloss=1.3772\n",
      "2024-10-08T13:25:41Z INFO     Train Epoch: 1 [6400/60000 (42%)]\tloss=1.2621\n",
      "2024-10-08T13:25:41Z INFO     Train Epoch: 1 [7680/60000 (51%)]\tloss=1.1353\n",
      "2024-10-08T13:25:42Z INFO     Train Epoch: 1 [8960/60000 (59%)]\tloss=1.0971\n",
      "2024-10-08T13:25:43Z INFO     Train Epoch: 1 [10240/60000 (68%)]\tloss=1.0772\n",
      "2024-10-08T13:25:44Z INFO     Train Epoch: 1 [11520/60000 (76%)]\tloss=1.0657\n",
      "2024-10-08T13:25:44Z INFO     Train Epoch: 1 [12800/60000 (85%)]\tloss=1.0127\n",
      "2024-10-08T13:25:45Z INFO     Train Epoch: 1 [14080/60000 (93%)]\tloss=0.9161\n",
      "2024-10-08T13:25:46Z INFO     Train Epoch: 2 [0/60000 (0%)]\tloss=1.3036\n",
      "2024-10-08T13:25:46Z INFO     Train Epoch: 2 [1280/60000 (8%)]\tloss=0.8902\n",
      "2024-10-08T13:25:47Z INFO     Train Epoch: 2 [2560/60000 (17%)]\tloss=0.9369\n",
      "2024-10-08T13:25:48Z INFO     Train Epoch: 2 [3840/60000 (25%)]\tloss=0.9562\n",
      "2024-10-08T13:25:49Z INFO     Train Epoch: 2 [5120/60000 (34%)]\tloss=0.8001\n",
      "2024-10-08T13:25:49Z INFO     Train Epoch: 2 [6400/60000 (42%)]\tloss=0.8546\n",
      "2024-10-08T13:25:50Z INFO     Train Epoch: 2 [7680/60000 (51%)]\tloss=0.8226\n",
      "2024-10-08T13:25:51Z INFO     Train Epoch: 2 [8960/60000 (59%)]\tloss=0.9489\n",
      "2024-10-08T13:25:52Z INFO     Train Epoch: 2 [10240/60000 (68%)]\tloss=0.8800\n",
      "2024-10-08T13:25:52Z INFO     Train Epoch: 2 [11520/60000 (76%)]\tloss=0.8957\n",
      "2024-10-08T13:25:53Z INFO     Train Epoch: 2 [12800/60000 (85%)]\tloss=0.8961\n",
      "2024-10-08T13:25:54Z INFO     Train Epoch: 2 [14080/60000 (93%)]\tloss=0.7958\n",
      "2024-10-08T13:25:54Z INFO     Training is finished\n",
      "2024-10-08T13:25:54Z INFO     Training is finished\n",
      "[2024-10-08 13:25:59,467] torch.distributed.elastic.agent.server.api: [ERROR] Error waiting on exit barrier. Elapsed: 0.004681587219238281 seconds\n",
      "[2024-10-08 13:25:59,468] torch.distributed.elastic.rendezvous.dynamic_rendezvous: [WARNING] The node 'train-pytorch-master-0_7_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "logs, _ = training_client.get_job_logs(pytorchjob_name)\n",
    "\n",
    "print(logs[\"train-pytorch-master-0\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17b0ca43-1936-4708-b03b-3ab9ac2bbdea",
   "metadata": {},
   "source": [
    "## Delete PyTorchJob\n",
    "\n",
    "When PyTorchJob is finished, you can delete the resource."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "32ae88fd-5b5d-4ba1-a560-9a35c5ac17de",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "training_client.delete_job(pytorchjob_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9641e9f-551d-44d5-872b-002fffaedcef",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "training",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
